#Import libraries
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(wordcloud2)
library(reshape)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(stringr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:reshape':
##
## rename
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the cleaned housing data set from the working directory.
housing <- read.csv("housing_cleaned.csv")
# Column names to keep: everything except the first column (the 'X' column,
# per the original note) and the column at original position 12. The original
# code reached the same result by dropping index 1 and then index 11 of the
# already-shortened vector.
# NOTE(review): confirm which column sits at position 12 and that it is meant
# to be removed; the original comment only mentioned 'X'.
housing_columns <- colnames(housing)[-c(1, 12)]
# Restrict the data frame to the retained columns (all rows are kept).
housing <- housing[, housing_columns, drop = FALSE]
#Sub-Categories
#Number/types of sub-categories-bar chart
#' Interactive bar chart of sub-category counts
#'
#' Reads the global `housing` data frame and counts rows per value of its
#' `Sub.categories` column.
#'
#' @param state Either "All Sample States" to plot every row of `housing`, or
#'   a state name to restrict the plot to rows whose `State` column matches.
#'   State names are compared case-insensitively.
#' @return The object returned by `ggplotly()` for the bar chart.
sub_cat_counts <- function(state) {
  if (state == "All Sample States") {
    plot_data <- housing
  } else {
    # The original title-cased the input but then filtered on the raw value,
    # so the normalisation never took effect. Comparing lower-cased values
    # makes "california" and "California" select the same rows.
    state_values <- tolower(as.character(housing[["State"]]))
    # which() skips rows whose State is NA; plain logical indexing would
    # turn each of them into an all-NA row that shows up as an extra bar.
    keep <- which(state_values == tolower(state))
    plot_data <- housing[keep, , drop = FALSE]
  }
  sub_cats <- ggplot(plot_data, aes(x = Sub.categories)) +
    geom_bar(fill = "blue") +
    labs(
      x = "Sub-Category",
      y = "Counts",
      title = "Types of Sub-Categories"
    ) +
    theme(axis.text.x = element_text(angle = 25))
  ggplotly(sub_cats)
}
sub_cat_counts("All Sample States")
#Tool
#Number/types of tools-bar chart
# (The original notes ask for a pie chart; a bar chart is what is drawn.)
#' Interactive bar chart of tool-type counts
#'
#' Reads the global `housing` data frame and counts rows per value of its
#' `Tool` column.
#'
#' @param state Either "All Sample States" to plot every row of `housing`, or
#'   a state name to restrict the plot to rows whose `State` column matches.
#'   State names are compared case-insensitively.
#' @return The object returned by `ggplotly()` for the bar chart.
tool_barchart <- function(state) {
  if (state == "All Sample States") {
    plot_data <- housing
  } else {
    # The original title-cased the input but then filtered on the raw value,
    # so the normalisation never took effect. Comparing lower-cased values
    # makes "california" and "California" select the same rows.
    state_values <- tolower(as.character(housing[["State"]]))
    # which() skips rows whose State is NA; plain logical indexing would
    # turn each of them into an all-NA row that shows up as an extra bar.
    keep <- which(state_values == tolower(state))
    plot_data <- housing[keep, , drop = FALSE]
  }
  tools <- ggplot(plot_data, aes(x = Tool)) +
    geom_bar(fill = "blue") +
    labs(x = "Tool", y = "Count", title = "Types of Tools") +
    theme(axis.text.x = element_text(angle = 45))
  ggplotly(tools)
}
tool_barchart("California")
#Tool Name
#Word cloud for tool names
#' Word cloud of the words used in tool names
#'
#' Reads the `Tool.Name` column of the global `housing` data frame, cleans the
#' text with `tm`, and draws a word cloud on the active graphics device.
#'
#' @param state "all states" (or "All Sample States", the label the bar chart
#'   functions in this file use) to include every row; otherwise a state name,
#'   compared case-insensitively against the `State` column.
#' @return The value returned by `wordcloud()`. When the selection contains no
#'   usable words, a warning is raised and `NULL` is returned invisibly
#'   without drawing anything.
tool_cloud <- function(state) {
  all_labels <- c("all states", "all sample states")
  if (tolower(state) %in% all_labels) {
    tool_names <- housing$Tool.Name
  } else {
    # which() ignores rows whose State is NA; the original per-row
    # `if (State[i] == state)` would stop with an error on such a row.
    state_values <- tolower(as.character(housing$State))
    tool_names <- housing$Tool.Name[which(state_values == tolower(state))]
  }
  # Missing names would otherwise be pasted in as the literal word "NA".
  tool_names <- as.character(tool_names)
  tool_names <- tool_names[!is.na(tool_names)]
  if (length(tool_names) == 0) {
    warning("No tool names found for state '", state, "'.", call. = FALSE)
    return(invisible(NULL))
  }
  # Join with a space: the original used sep = "", which glued the last word
  # of one name onto the first word of the next and distorted the counts.
  combo <- paste(tool_names, collapse = " ")
  # Turn the string into a one-document corpus and clean it.
  docs <- Corpus(VectorSource(combo))
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, stripWhitespace)
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeWords, stopwords("english"))
  # Term frequencies, most frequent first.
  mtx <- as.matrix(TermDocumentMatrix(docs))
  words <- sort(rowSums(mtx), decreasing = TRUE)
  if (length(words) == 0) {
    warning("No words left to plot for state '", state, "'.", call. = FALSE)
    return(invisible(NULL))
  }
  df <- data.frame(word = names(words), freq = words)
  # Fixed seed so the layout is reproducible between calls.
  set.seed(33)
  cloud <- wordcloud(
    words = df$word,
    freq = df$freq,
    min.freq = 1,
    max.words = 100,
    random.order = FALSE,
    rot.per = 0,
    colors = brewer.pal(4, "Set1")
  )
  cloud
}
#Variables
#Word cloud for variable names
#' Word cloud of the words used in variable descriptions
#'
#' Reads the `Variables` column of the global `housing` data frame, cleans the
#' text with `tm`, and draws a word cloud on the active graphics device.
#'
#' @param state "all states" (or "All Sample States", the label the bar chart
#'   functions in this file use) to include every row; otherwise a state name,
#'   compared case-insensitively against the `State` column.
#' @return The value returned by `wordcloud()`. When the selection contains no
#'   usable words, a warning is raised and `NULL` is returned invisibly
#'   without drawing anything.
variable_cloud <- function(state) {
  all_labels <- c("all states", "all sample states")
  if (tolower(state) %in% all_labels) {
    variable_text <- housing$Variables
  } else {
    # which() ignores rows whose State is NA; the original per-row
    # `if (State[i] == state)` would stop with an error on such a row.
    state_values <- tolower(as.character(housing$State))
    variable_text <- housing$Variables[which(state_values == tolower(state))]
  }
  # Missing entries would otherwise be pasted in as the literal word "NA".
  variable_text <- as.character(variable_text)
  variable_text <- variable_text[!is.na(variable_text)]
  if (length(variable_text) == 0) {
    warning("No variables found for state '", state, "'.", call. = FALSE)
    return(invisible(NULL))
  }
  # Join with a space: the original used sep = "", which glued the last word
  # of one entry onto the first word of the next and distorted the counts.
  combo <- paste(variable_text, collapse = " ")
  # Turn the string into a one-document corpus and clean it.
  docs <- Corpus(VectorSource(combo))
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, stripWhitespace)
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeWords, stopwords("english"))
  # Term frequencies, most frequent first.
  mtx <- as.matrix(TermDocumentMatrix(docs))
  words <- sort(rowSums(mtx), decreasing = TRUE)
  if (length(words) == 0) {
    warning("No words left to plot for state '", state, "'.", call. = FALSE)
    return(invisible(NULL))
  }
  df <- data.frame(word = names(words), freq = words)
  # Fixed seed so the layout is reproducible between calls.
  set.seed(33)
  cloud <- wordcloud(
    words = df$word,
    freq = df$freq,
    min.freq = 1,
    max.words = 100,
    random.order = FALSE,
    rot.per = 0,
    colors = brewer.pal(4, "Set1")
  )
  cloud
}
#Geographic Levels
#Number/types of geographic levels-bar chart
# (The original notes ask for a pie chart; a bar chart is what is drawn.)
# Counts rows of `housing` per value of Geographic.Levels across all states.
# The plot title previously read "Geoographic"; the spelling is corrected.
geo <- ggplot(housing, aes(x = Geographic.Levels)) +
  geom_bar(fill = "blue") +
  labs(
    x = "Geographic Levels",
    y = "Count",
    title = "Types of Geographic Levels"
  ) +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(geo)
#Data Sources-Census
#Number/types of census sources-bar chart
# Rows with a missing census source are excluded so that NA is not drawn as
# its own bar (this was an open "fix" note in the original).
# NOTE(review): only true NA values are removed; if missing sources are stored
# as empty strings in the CSV they will still appear as a category.
# TODO: allow filtering by state.
census_rows <- housing[!is.na(housing$Data.Sources.Census), , drop = FALSE]
census <- ggplot(census_rows, aes(x = Data.Sources.Census)) +
  geom_bar(fill = "blue") +
  labs(
    x = "Census Source",
    y = "Count",
    title = "Types of Census Sources"
  ) +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(census)
#Data Sources-Non Census
#Number/types of non-census sources-bar chart
# Rows with a missing non-census source are excluded so that NA is not drawn
# as its own bar (this was an open "fix" note in the original).
# NOTE(review): only true NA values are removed; if missing sources are stored
# as empty strings in the CSV they will still appear as a category.
# TODO: allow filtering by state.
non_census_rows <- housing[
  !is.na(housing$Data.Sources.Non.Census), ,
  drop = FALSE
]
non_census <- ggplot(non_census_rows, aes(x = Data.Sources.Non.Census)) +
  geom_bar(fill = "blue") +
  labs(x = "Data Sources", y = "Count", title = "Data Sources") +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(non_census)
#Direct Links to Census
# Number of rows whose Direct.links.to.Census value is anything other than
# 'n'. The result is NA if the column contains any NA.
sum(housing$Direct.links.to.Census != "n")
## [1] 166
#Age of Data
#Number/types of age of data-pie chart
#Oldest data, most recent
#Historical Data
# Number of rows whose Historical.data value is exactly 'y'.
# NOTE(review): the recorded result is 0. The same value is produced when the
# column is absent from `housing` (NULL == "y" sums to 0) or when "yes" is
# encoded differently (e.g. "Y"); confirm the column survives the column drop
# at the top of the script and check its actual values.
sum(housing$Historical.data == "y")
## [1] 0